EDA and Text Preprocessing¶

Import module¶

In [29]:
# Basic
import re
import numpy as np
import pandas as pd
# Plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Machine Learning
from sklearn.model_selection import train_test_split
# Text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
# Other
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): suppresses ALL warnings notebook-wide; consider filtering by category instead
In [1]:
# Configure Plotly so figures render inside the saved notebook
# NOTE(review): these imports ideally belong in the main import cell above
import plotly
import plotly.io as pio
plotly.offline.init_notebook_mode(connected=True)  # embed plotly.js so figures display offline
pio.renderers.default='notebook'

1: EDA¶

In [30]:
# Load the Kaggle MBTI dataset: one row per user, with the MBTI `type`
# and all of that user's posts joined by '|||' in a single `posts` string.
# NOTE(review): relative path — assumes the notebook runs from its own folder.
data = pd.read_csv('../data/Kaggle_MBTI.csv')
data.shape
Out[30]:
(8675, 2)
In [31]:
# Dtypes and null counts — both columns are fully non-null object (string) columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB
In [32]:
# Peek at the first few rows
data.head()
Out[32]:
type posts
0 INFJ 'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1 ENTP 'I'm finding the lack of me in these posts ver...
2 INTP 'Good one _____ https://www.youtube.com/wat...
3 INTJ 'Dear INTP, I enjoyed our conversation the o...
4 ENTJ 'You're fired.|||That's another silly misconce...
In [33]:
# Show some posts of the first user.
# The `posts` field packs many posts separated by '|||'; split it into one
# post per row and display a random sample. A fixed random_state makes the
# displayed sample reproducible under Restart & Run All (the original
# unseeded .sample(10) showed different rows on every run).
user1post = data.posts.iloc[0].split('|||')  # list of individual posts
user1post_random = pd.DataFrame(data=user1post, columns=['post']).sample(10, random_state=42)
user1post_random.style.set_table_styles([
	{'selector': 'th', 'props': [('text-align', 'left')]},
	{'selector': 'td', 'props': [('text-align', 'left')]}
	], overwrite=False)
Out[33]:
post
27 http://www.youtube.com/watch?v=4V2uYORhQOk
16 It appears to be too late. :sad:
19 I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to...
32 Banned for a whole host of reasons!
44 http://www.youtube.com/watch?v=w8IgImn57aQ
47 I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...
2 enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks
10 http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg Game. Set. Match.
26 http://www.youtube.com/watch?v=Mw7eoU3BMbE
5 May the PerC Experience immerse you.
In [34]:
# Many records contain URLs
url_mask = data['posts'].str.contains('www', case=True)
search_URL = data.loc[url_mask]
search_URL
Out[34]:
type posts
0 INFJ 'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1 ENTP 'I'm finding the lack of me in these posts ver...
2 INTP 'Good one _____ https://www.youtube.com/wat...
3 INTJ 'Dear INTP, I enjoyed our conversation the o...
4 ENTJ 'You're fired.|||That's another silly misconce...
... ... ...
8667 ENTP 'I think generally people experience post trau...
8669 INFJ 'I'm not sure about a method for picking out I...
8670 ISFP 'https://www.youtube.com/watch?v=t8edHB_h908||...
8672 INTP 'So many questions when i do these things. I ...
8674 INFP 'It has been too long since I have been on per...

4537 rows × 2 columns

In [36]:
# Bar chart of how many users fall under each MBTI type.
# `color` (the Sunset_r palette) is reused by the pie charts further down.
color = px.colors.sequential.Sunset_r
df_count = (
	data['type']
	.value_counts()
	.rename_axis('type')
	.reset_index(name='counts')
)
fig = px.bar(
	df_count,
	x='type',
	y='counts',
	color='type',
	color_discrete_sequence=color,
	title='Type count',
	width=1000,
	height=600,
)
fig.show()
In [37]:
# Stratify split to ensure equal distribution of data
# Stratifying on `type` keeps the (highly imbalanced) class proportions
# identical in the train and test partitions; random_state pins the split.
train_data, test_data = train_test_split(data,
	test_size=0.2,
	random_state=42,
	stratify=data.type)
In [38]:
# Inspect the training partition (80% of rows, stratified by type)
train_data
Out[38]:
type posts
1228 INFP 'We are mandarin speakers. He receive educati...
1290 ISTP 'Nope. Not now, not ever. I'm too busy with ...
6756 ENFJ 'That's the only one I haven't gotten to read ...
1662 INFP 'I used to think that maturity was burning bri...
3338 INFP 'I get typed as both a 4w5 and 5w6 as well but...
... ... ...
7292 INFP Haven't posted here in a while. Here was my at...
1086 INFP 'Ok, I'll go first. I'm a 29 year old INFP mal...
7435 ENTJ 'I have dated a few INFJs, including my curren...
1843 INTP 'People who are unable to replace social norms...
2530 ENTP 'Yep! you're right! I agree with you!! i think...

6940 rows × 2 columns

In [39]:
def count_types(df):
	"""Return per-MBTI-type row counts as a frame with 'type' and 'counts' columns."""
	return df['type'].value_counts().rename_axis('type').reset_index(name='counts')

# Same aggregation for both splits — factored into a helper to avoid
# copy-pasted chains drifting apart.
df_train_count = count_types(train_data)
df_test_count = count_types(test_data)
In [40]:
# Per-type counts in the training split (same rank order as the full data)
df_train_count
Out[40]:
type counts
0 INFP 1465
1 INFJ 1176
2 INTP 1043
3 INTJ 873
4 ENTP 548
5 ENFP 540
6 ISTP 270
7 ISFP 217
8 ENTJ 185
9 ISTJ 164
10 ENFJ 152
11 ISFJ 133
12 ESTP 71
13 ESFP 38
14 ESFJ 34
15 ESTJ 31
In [41]:
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])

# One pie per split, side by side
for pie_col, (split_counts, split_name) in enumerate(
		[(df_train_count, 'Train'), (df_test_count, 'Test')], start=1):
	fig.add_trace(
		go.Pie(labels=split_counts['type'], values=split_counts['counts'], name=split_name),
		1, pie_col)

# Donut style, shared palette with the bar chart above
fig.update_traces(hole=.4, hovertemplate='Type: %{label}<br>Count: %{value}', marker_colors=color)
fig.update_layout(
    title_text = "Train_Test Split",
    annotations = [dict(text='Train', x=0.2, y=0.5, font_size=18, showarrow=False),
                 dict(text='Test', x=0.795, y=0.5, font_size=18, showarrow=False)])
fig.show()
In [42]:
# fig = px.pie(train_data, names='type', title='Train data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()
In [43]:
# fig = px.pie(test_data, names='type', title='Test data', color_discrete_sequence=colors, hole=0.3, width=800, height=600 )
# fig.update_traces(textinfo='percent',hovertemplate='Type: %{label}<br>Count: %{value}')
# fig.show()

2: Text Preprocessing¶

Todo:

  • 要考慮:目前有去除冒號跟井字號,但有些可能是 emoji 或 hashtag (例如:":happy:"、"#mood")
  • 目前是把超連結替換成 URL,也可以直接拔掉
  • 還沒做 stop words 包含 MBTI 的
  • 目前有把 porter stemmer 當參數之一,看要不要刪掉,也可以跟 SnowballStemmer 一起訓練,看結果有沒有差

2.1: Cleaning¶

  • 簡單的去除分隔符、超連結、符號、多餘的空格

getCleanPost()¶

In [44]:
def getCleanPost(text):
	"""Strip notebook-specific noise from a raw MBTI post string.

	Applied in order: replace the '|||' post separator with a space,
	substitute every hyperlink with the literal token 'URL', turn every
	non-alphanumeric character into a space, and squeeze runs of spaces
	down to one.
	"""
	substitutions = [
		(r'\|\|\|', ' '),        # post separator
		(r'http\S+', 'URL'),     # hyperlinks -> placeholder token
		(r'[^0-9a-zA-Z]', ' '),  # punctuation/symbols -> space
		(r' +', ' '),            # collapse repeated spaces
	]
	for pattern, replacement in substitutions:
		text = re.sub(pattern, replacement, text)
	return text

Explanation¶

In [45]:
'''
Example of getCleanPost.
input: first 520 characters of data.posts[0]
output: getCleanPost(input)
'''
origi_sentence = data.posts[0][0:520]
clean_sentence = getCleanPost(origi_sentence)

# '\033[96m' / '\033[94m' are ANSI colour codes (cyan / blue) for the printout
print('\033[96mBefore cleaning:\n',origi_sentence,'\n')

print('\033[94mAfter cleaning:\n',clean_sentence)
Before cleaning:
 'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend pos 

After cleaning:
  URL URL enfp and intj moments URL sportscenter not top ten plays URL pranks What has been the most life changing experience in your life URL URL On repeat for most of today May the PerC Experience immerse you The last thing my INFJ friend pos

Apply¶

In [46]:
# Apply getCleanPost to all training data
train_data_copy = train_data.copy()  # work on a copy so the raw split stays intact
tqdm.pandas()  # Progress bar
train_data_copy['posts_clean'] = train_data_copy['posts'].progress_apply(getCleanPost)
train_data_copy
  0%|          | 0/6940 [00:00<?, ?it/s]
Out[46]:
type posts posts_clean
1228 INFP 'We are mandarin speakers. He receive educati... We are mandarin speakers He receive education...
1290 ISTP 'Nope. Not now, not ever. I'm too busy with ... Nope Not now not ever I m too busy with work ...
6756 ENFJ 'That's the only one I haven't gotten to read ... That s the only one I haven t gotten to read ...
1662 INFP 'I used to think that maturity was burning bri... I used to think that maturity was burning bri...
3338 INFP 'I get typed as both a 4w5 and 5w6 as well but... I get typed as both a 4w5 and 5w6 as well but...
... ... ... ...
7292 INFP Haven't posted here in a while. Here was my at... Haven t posted here in a while Here was my att...
1086 INFP 'Ok, I'll go first. I'm a 29 year old INFP mal... Ok I ll go first I m a 29 year old INFP male ...
7435 ENTJ 'I have dated a few INFJs, including my curren... I have dated a few INFJs including my current...
1843 INTP 'People who are unable to replace social norms... People who are unable to replace social norms...
2530 ENTP 'Yep! you're right! I agree with you!! i think... Yep you re right I agree with you i think see...

6940 rows × 3 columns

2.2: Tokenization & Remove stop words¶

  • 轉小寫
  • 切詞
  • 移除 Stop words

getCleanToken()¶

In [47]:
# Stop word list
# NLTK's standard English stop-word list (all lowercase), used by the
# tokenization steps below.
stop_words = stopwords.words('english')
print('Stop words\n',stop_words)
Stop words
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
In [48]:
def getCleanToken(text):
	"""Clean a raw post string, lowercase it, tokenize, and drop English stop words.

	Pipeline: the same four regex substitutions as getCleanPost (remove '|||'
	separators, replace hyperlinks with 'URL', keep only alphanumerics,
	squeeze spaces), then lowercase, word-tokenize, and filter stop words.

	Returns a list of lowercase tokens.
	"""
	# getCleanPost
	text = re.sub(r'\|\|\|', r' ', text)
	text = re.sub(r'http\S+', r'URL', text)
	text = re.sub('[^0-9a-zA-Z]', ' ', text)
	text = re.sub(' +', ' ', text)
	# Add "Tokenization" and remove stopword
	text = text.lower()
	tokens = word_tokenize(text)
	# A set gives O(1) membership tests; the original scanned the whole
	# stop-word list once per token (O(tokens * stopwords)).
	stop_set = set(stop_words)
	filtered_tokens = [w for w in tokens if w not in stop_set]
	return filtered_tokens

Explanation¶

In [49]:
'''
Example of the added part in getCleanToken.
Referred to the paragraph # Add "Tokenization" and remove stopword
input: getCleanPost(user #1228)
output: getCleanToken(user #1228)
'''
clean_post = getCleanPost(train_data.posts[1228])

# Tokenization
tokens = word_tokenize(clean_post)
print(f'Original: {len(tokens)} tokens\n')

# Stop words
filtered_tokens = [w for w in tokens if not w in stop_words]
print(f'After removing stop words: {len(filtered_tokens)} tokens\n')

# Check removed words
# Set difference shows which (unique) tokens were filtered out as stop words
print(f'Removed words: {list(set(tokens).difference(set(filtered_tokens)))}')
Original: 879 tokens

After removing stop words: 508 tokens

Removed words: ['up', 'which', 'each', 'and', 'will', 'about', 'only', 'him', 'no', 'yours', 'it', 'why', 'the', 'out', 'just', 'am', 's', 'from', 'who', 'or', 'with', 't', 'be', 'll', 'all', 'can', 'if', 'very', 'do', 'were', 'that', 'he', 'what', 'then', 'has', 'more', 'my', 'they', 'at', 'of', 'few', 'while', 'your', 'other', 'have', 'o', 'we', 'm', 'them', 'where', 'for', 'is', 'those', 'you', 'to', 'did', 'any', 'don', 'her', 'an', 'some', 'here', 'in', 'won', 'too', 'as', 'me', 'his', 'so', 'now', 'are', 'this', 'their', 'once', 'a', 'on', 'not', 'how', 'when', 'because']

Apply¶

In [50]:
# Apply getCleanToken to all training data
# Note this re-cleans from the raw `posts` column (not `posts_clean`),
# since getCleanToken repeats the cleaning steps internally.
tqdm.pandas()  # Progress bar
train_data_copy['tokens_clean'] = train_data_copy['posts'].progress_apply(getCleanToken)
  0%|          | 0/6940 [00:00<?, ?it/s]
In [51]:
# Now with the tokens_clean column added
train_data_copy
Out[51]:
type posts posts_clean tokens_clean
1228 INFP 'We are mandarin speakers. He receive educati... We are mandarin speakers He receive education... [mandarin, speakers, receive, education, canad...
1290 ISTP 'Nope. Not now, not ever. I'm too busy with ... Nope Not now not ever I m too busy with work ... [nope, ever, busy, work, causes, adrenaline, r...
6756 ENFJ 'That's the only one I haven't gotten to read ... That s the only one I haven t gotten to read ... [one, gotten, read, yet, might, pick, one, boo...
1662 INFP 'I used to think that maturity was burning bri... I used to think that maturity was burning bri... [used, think, maturity, burning, bridges, with...
3338 INFP 'I get typed as both a 4w5 and 5w6 as well but... I get typed as both a 4w5 and 5w6 as well but... [get, typed, 4w5, 5w6, well, like, consider, 4...
... ... ... ... ...
7292 INFP Haven't posted here in a while. Here was my at... Haven t posted here in a while Here was my att... [posted, attire, best, man, buddies, wedding, ...
1086 INFP 'Ok, I'll go first. I'm a 29 year old INFP mal... Ok I ll go first I m a 29 year old INFP male ... [ok, go, first, 29, year, old, infp, male, int...
7435 ENTJ 'I have dated a few INFJs, including my curren... I have dated a few INFJs including my current... [dated, infjs, including, current, partner, 6,...
1843 INTP 'People who are unable to replace social norms... People who are unable to replace social norms... [people, unable, replace, social, norms, ratio...
2530 ENTP 'Yep! you're right! I agree with you!! i think... Yep you re right I agree with you i think see... [yep, right, agree, think, seeking, pressure, ...

6940 rows × 4 columns

Compare¶

In [52]:
# Statistics: token counts before vs. after stop-word removal
# posts_clean is a whitespace-separated string; tokens_clean is already a list.
train_data_copy['Words count after getCleanPost'] = train_data_copy['posts_clean'].str.split().str.len()
train_data_copy['Words count after getCleanToken'] = train_data_copy['tokens_clean'].str.len()
train_data_copy
Out[52]:
type posts posts_clean tokens_clean Words count after getCleanPost Words count after getCleanToken
1228 INFP 'We are mandarin speakers. He receive educati... We are mandarin speakers He receive education... [mandarin, speakers, receive, education, canad... 879 444
1290 ISTP 'Nope. Not now, not ever. I'm too busy with ... Nope Not now not ever I m too busy with work ... [nope, ever, busy, work, causes, adrenaline, r... 1299 648
6756 ENFJ 'That's the only one I haven't gotten to read ... That s the only one I haven t gotten to read ... [one, gotten, read, yet, might, pick, one, boo... 1273 571
1662 INFP 'I used to think that maturity was burning bri... I used to think that maturity was burning bri... [used, think, maturity, burning, bridges, with... 1479 678
3338 INFP 'I get typed as both a 4w5 and 5w6 as well but... I get typed as both a 4w5 and 5w6 as well but... [get, typed, 4w5, 5w6, well, like, consider, 4... 1142 522
... ... ... ... ... ... ...
7292 INFP Haven't posted here in a while. Here was my at... Haven t posted here in a while Here was my att... [posted, attire, best, man, buddies, wedding, ... 653 331
1086 INFP 'Ok, I'll go first. I'm a 29 year old INFP mal... Ok I ll go first I m a 29 year old INFP male ... [ok, go, first, 29, year, old, infp, male, int... 1086 535
7435 ENTJ 'I have dated a few INFJs, including my curren... I have dated a few INFJs including my current... [dated, infjs, including, current, partner, 6,... 1367 683
1843 INTP 'People who are unable to replace social norms... People who are unable to replace social norms... [people, unable, replace, social, norms, ratio... 720 332
2530 ENTP 'Yep! you're right! I agree with you!! i think... Yep you re right I agree with you i think see... [yep, right, agree, think, seeking, pressure, ... 1751 787

6940 rows × 6 columns

In [53]:
# Summary statistics of the two word-count columns (the only numeric columns)
train_data_copy.describe()
Out[53]:
Words count after getCleanPost Words count after getCleanToken
count 6940.000000 6940.000000
mean 1320.436888 618.429251
std 325.409982 140.097720
min 5.000000 4.000000
25% 1132.000000 539.000000
50% 1374.000000 643.000000
75% 1561.000000 721.000000
max 1998.000000 927.000000

2.3: Stemming and Lemmatization¶

  • 比較 PorterStemmer 與 SnowballStemmer 的結果
  • 用 WordNetLemmatizer 進行 Lemmatization

Preprocessor()

In [54]:
def Preprocessor(text, stemmer='Snowball'):
	"""Full preprocessing pipeline: clean, tokenize, de-stopword, stem, lemmatize.

	Parameters
	----------
	text : str
		Raw '|||'-joined posts string for one user.
	stemmer : str, default 'Snowball'
		'Porter' selects PorterStemmer; anything else uses SnowballStemmer.

	Returns a list of stemmed-then-lemmatized lowercase tokens.

	BUGFIX: the original immediately rebound the `stemmer` argument to a
	SnowballStemmer instance, so `stemmer == 'Porter'` was always False (dead
	branch) — and that branch referenced `stemmer_ps`, a name defined only in
	a later cell, which would have raised NameError on a fresh kernel. The
	stemmer is now chosen from the argument before any stemming happens, and
	each call stems exactly once.
	"""
	# getCleanToken steps
	text = re.sub(r'\|\|\|', r' ', text)
	text = re.sub(r'http\S+', r'URL', text)
	text = re.sub('[^0-9a-zA-Z]', ' ', text)
	text = re.sub(' +', ' ', text)
	text = text.lower()
	tokens = word_tokenize(text)
	stop_set = set(stop_words)  # O(1) membership instead of list scans
	filtered_tokens = [w for w in tokens if w not in stop_set]

	# "Stemming" and "Lemmatization"
	stem = PorterStemmer() if stemmer == 'Porter' else SnowballStemmer('english')
	lemma = WordNetLemmatizer()
	stemmed = [stem.stem(t) for t in filtered_tokens]
	lemmatized = [lemma.lemmatize(t) for t in stemmed]

	return lemmatized

Explanation¶

In [55]:
'''
Example of the added part in Preprocessor.
Referred to the paragraph: # Add "Stemming" and "Lemmatization"
input: getCleanToken(user #1228)
output: Preprocessor(user #1228)
'''
clean_token = getCleanToken(train_data.posts[1228])
# Initiate
stemmer_ps = PorterStemmer()
stemmer_ss = SnowballStemmer("english") 
lemma = WordNetLemmatizer()
# Stemming
stemmed_ps = [stemmer_ps.stem(t) for t in clean_token]
stemmed_ss = [stemmer_ss.stem(t) for t in clean_token]
# Lemmatizing
# (applied to already-stemmed tokens, so it mostly strips residual plurals)
lemmatized_ps = [lemma.lemmatize(t) for t in stemmed_ps]
lemmatized_ss = [lemma.lemmatize(t) for t in stemmed_ss]

Compare different Stemmer¶

In [56]:
# Side-by-side comparison of the stemmers and lemmatizers
# ('stle' = STemmer/LEmmatizer comparison frame).
df_stle = pd.DataFrame({
	'Original(clean_token)': clean_token,
	'PorterStemmer': stemmed_ps,
	'SnowballStemmer': stemmed_ss,
	'Lemma with PorterStemmer': lemmatized_ps,
	'Lemma with SnowballStemmer': lemmatized_ss,
})
df_stle.head(10)
Out[56]:
Original(clean_token) PorterStemmer SnowballStemmer Lemma with PorterStemmer Lemma with SnowballStemmer
0 mandarin mandarin mandarin mandarin mandarin
1 speakers speaker speaker speaker speaker
2 receive receiv receiv receiv receiv
3 education educ educ educ educ
4 canada canada canada canada canada
5 since sinc sinc sinc sinc
6 13 13 13 13 13
7 thanks thank thank thank thank
8 bellisaurius bellisauriu bellisaurius bellisauriu bellisaurius
9 appreciate appreci appreci appreci appreci
In [57]:
# Rows where the two stemmers disagree for user #1228's tokens
diff_result = df_stle.query('PorterStemmer != SnowballStemmer')
print(f'The PorterStemmer and SnowballStemmer has\
  {diff_result.shape[0]} / {df_stle.shape[0]}\
  different tokens in user #1228\'s posts.')
diff_result
The PorterStemmer and SnowballStemmer has  15 / 444  different tokens in user #1228's posts.
Out[57]:
Original(clean_token) PorterStemmer SnowballStemmer Lemma with PorterStemmer Lemma with SnowballStemmer
8 bellisaurius bellisauriu bellisaurius bellisauriu bellisaurius
10 kindly kindli kind kindli kind
41 yes ye yes ye yes
46 yes ye yes ye yes
157 yes ye yes ye yes
161 saurus sauru saurus sauru saurus
291 dos do dos do do
304 pros pro pros pro pro
318 exactly exactli exact exactli exact
382 dos do dos do do
387 dos do dos do do
399 dos do dos do do
409 dos do dos do do
414 dos do dos do do
422 communication commun communic commun communic

Apply¶

In [58]:
# Apply Preprocessor to all training data
# Uses the default stemmer ('Snowball'); this is the heaviest cell in the
# notebook (clean + tokenize + stem + lemmatize every token of every user).
tqdm.pandas()  # Progress bar
train_data_copy['preprocessed'] = train_data_copy['posts'].progress_apply(Preprocessor)
  0%|          | 0/6940 [00:00<?, ?it/s]

2.4: Result¶

Evolution¶

In [59]:
# Hide the two word-count helper columns for display. Dropping by NAME
# (instead of the original positional columns[[4,5]]) keeps this cell correct
# even if columns are added or reordered. Returns a new frame;
# train_data_copy itself is unchanged.
train_data_copy.drop(columns=['Words count after getCleanPost', 'Words count after getCleanToken'])
Out[59]:
type posts posts_clean tokens_clean preprocessed
1228 INFP 'We are mandarin speakers. He receive educati... We are mandarin speakers He receive education... [mandarin, speakers, receive, education, canad... [mandarin, speaker, receiv, educ, canada, sinc...
1290 ISTP 'Nope. Not now, not ever. I'm too busy with ... Nope Not now not ever I m too busy with work ... [nope, ever, busy, work, causes, adrenaline, r... [nope, ever, busi, work, caus, adrenalin, rush...
6756 ENFJ 'That's the only one I haven't gotten to read ... That s the only one I haven t gotten to read ... [one, gotten, read, yet, might, pick, one, boo... [one, gotten, read, yet, might, pick, one, boo...
1662 INFP 'I used to think that maturity was burning bri... I used to think that maturity was burning bri... [used, think, maturity, burning, bridges, with... [use, think, matur, burn, bridg, without, seco...
3338 INFP 'I get typed as both a 4w5 and 5w6 as well but... I get typed as both a 4w5 and 5w6 as well but... [get, typed, 4w5, 5w6, well, like, consider, 4... [get, type, 4w5, 5w6, well, like, consid, 4w5,...
... ... ... ... ... ...
7292 INFP Haven't posted here in a while. Here was my at... Haven t posted here in a while Here was my att... [posted, attire, best, man, buddies, wedding, ... [post, attir, best, man, buddi, wed, 698410, u...
1086 INFP 'Ok, I'll go first. I'm a 29 year old INFP mal... Ok I ll go first I m a 29 year old INFP male ... [ok, go, first, 29, year, old, infp, male, int... [ok, go, first, 29, year, old, infp, male, int...
7435 ENTJ 'I have dated a few INFJs, including my curren... I have dated a few INFJs including my current... [dated, infjs, including, current, partner, 6,... [date, infj, includ, current, partner, 6, year...
1843 INTP 'People who are unable to replace social norms... People who are unable to replace social norms... [people, unable, replace, social, norms, ratio... [peopl, unabl, replac, social, norm, ration, e...
2530 ENTP 'Yep! you're right! I agree with you!! i think... Yep you re right I agree with you i think see... [yep, right, agree, think, seeking, pressure, ... [yep, right, agre, think, seek, pressur, relev...

6940 rows × 5 columns

Random example¶

In [60]:
# Show the raw pipeline input — the [:800] slice is by CHARACTERS, not words,
# so the label is corrected accordingly (the original said "800 words").
print(f'Input (first 800 characters):\n{train_data.posts[0][:800]}...')
Input (800 words):
'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times ...
In [61]:
# Full pipeline output for the same user (list of stemmed + lemmatized tokens)
print(f'Output:\n{Preprocessor(train_data.posts[0])}')
Output:
['url', 'url', 'enfp', 'intj', 'moment', 'url', 'sportscent', 'top', 'ten', 'play', 'url', 'prank', 'life', 'chang', 'experi', 'life', 'url', 'url', 'repeat', 'today', 'may', 'perc', 'experi', 'immers', 'last', 'thing', 'infj', 'friend', 'post', 'facebook', 'commit', 'suicid', 'next', 'day', 'rest', 'peac', 'url', 'hello', 'enfj7', 'sorri', 'hear', 'distress', 'natur', 'relationship', 'perfect', 'time', 'everi', 'moment', 'exist', 'tri', 'figur', 'hard', 'time', 'time', 'growth', '84389', '84390', 'url', 'url', 'welcom', 'stuff', 'url', 'game', 'set', 'match', 'prozac', 'wellbrutin', 'least', 'thirti', 'minut', 'move', 'leg', 'mean', 'move', 'sit', 'desk', 'chair', 'weed', 'moder', 'mayb', 'tri', 'edibl', 'healthier', 'altern', 'basic', 'come', 'three', 'item', 'determin', 'type', 'whichev', 'type', 'want', 'would', 'like', 'use', 'given', 'type', 'cognit', 'function', 'whatnot', 'left', 'thing', 'moder', 'sim', 'inde', 'video', 'game', 'good', 'one', 'note', 'good', 'one', 'somewhat', 'subject', 'complet', 'promot', 'death', 'given', 'sim', 'dear', 'enfp', 'favorit', 'video', 'game', 'grow', 'current', 'favorit', 'video', 'game', 'cool', 'url', 'appear', 'late', 'sad', 'someon', 'everyon', 'wait', 'thought', 'confid', 'good', 'thing', 'cherish', 'time', 'solitud', 'b', 'c', 'revel', 'within', 'inner', 'world', 'wherea', 'time', 'workin', 'enjoy', 'time', 'worri', 'peopl', 'alway', 'around', 'yo', 'entp', 'ladi', 'complimentari', 'person', 'well', 'hey', 'main', 'social', 'outlet', 'xbox', 'live', 'convers', 'even', 'verbal', 'fatigu', 'quick', 'url', 'realli', 'dig', 'part', '1', '46', '2', '50', 'url', 'ban', 'thread', 'requir', 'get', 'high', 'backyard', 'roast', 'eat', 'marshmellow', 'backyard', 'convers', 'someth', 'intellectu', 'follow', 'massag', 'kiss', 'url', 'url', 'url', 'ban', 'mani', 'b', 'sentenc', 'could', 'think', 'b', 'ban', 'watch', 'movi', 'corner', 'dunc', 'ban', 'health', 'class', 'clear', 'taught', 'noth', 'peer', 'pressur', 'ban', 'whole', 
'host', 'reason', 'url', '1', 'two', 'babi', 'deer', 'left', 'right', 'munch', 'beetl', 'middl', '2', 'use', 'blood', 'two', 'caveman', 'diari', 'today', 'latest', 'happen', 'design', 'cave', 'diari', 'wall', '3', 'see', 'pokemon', 'world', 'infj', 'societi', 'everyon', 'becom', 'optimist', '49142', 'url', 'url', 'url', 'url', 'artist', 'artist', 'draw', 'idea', 'count', 'form', 'someth', 'like', 'signatur', 'welcom', 'robot', 'rank', 'person', 'down', 'self', 'esteem', 'cuz', 'avid', 'signatur', 'artist', 'like', 'proud', 'ban', 'take', 'room', 'bed', 'ya', 'got', 'ta', 'learn', 'share', 'roach', 'url', 'ban', 'much', 'thunder', 'grumbl', 'kind', 'storm', 'yep', 'ahh', 'old', 'high', 'school', 'music', 'heard', 'age', 'url', 'fail', 'public', 'speak', 'class', 'year', 'ago', 'sort', 'learn', 'could', 'better', 'posit', 'big', 'part', 'failur', 'overload', 'like', 'person', 'mental', 'confirm', 'intj', 'way', 'url', 'move', 'denver', 'area', 'start', 'new', 'life']
In [ ]: